import re

# Journal title -> two-digit shelf code embedded in the generated ID.
_SHELF_IDS = {"勇罕": "00", "文蚌": "01"}

# "《title》YYYY年[第]N<non-space text>M期": an issue number followed later by a
# second number right before a "期" (e.g. "第12期总78期" or "第1-2期").
# The (?!\d) guard forces the first group to take the whole digit run;
# without it "\S+" may swallow digits and "第123期" is mis-read as issue "1".
_DOUBLE_NUMBER_PATTERN = re.compile(
    r'《(文蚌|勇罕)》(\d{4})年(?:第)?(\d+)(?!\d)\S+(\d+)期'
)

# "《title》YYYY年[第]N期": a single issue number.
_SINGLE_NUMBER_PATTERN = re.compile(r'《(文蚌|勇罕)》(\d{4})年(?:第)?(\d+)期')


def extract_info_from_filename(filename):
    """Build an ID string from a journal file name.

    ASCII spaces are stripped from ``filename`` first. The name must then
    start with ``《文蚌》`` or ``《勇罕》``, followed by a four-digit year, ``年``,
    an optional ``第`` and an issue number.

    The ID is ``year + shelf code + issue number + suffix`` where the shelf
    code is ``"00"`` for 勇罕 and ``"01"`` for 文蚌, and the suffix records
    which form matched:

    * ``":1"`` - the issue number is followed by a second number before a
      ``期`` (only the first number is used in the ID);
    * ``":2"`` - a single issue number directly followed by ``期``.

    Args:
        filename: File name to parse, e.g. ``"《勇罕》2015年 第3期.pdf"``.

    Returns:
        The ID string (``"2015003:2"`` for the example above), or ``None``
        when the file name matches neither form.
    """
    compact = filename.replace(' ', '')

    # The two-number form is tried first so it takes precedence, as before.
    candidates = (
        (_DOUBLE_NUMBER_PATTERN, ":1"),
        (_SINGLE_NUMBER_PATTERN, ":2"),
    )
    for pattern, suffix in candidates:
        match = pattern.match(compact)
        if match:
            shelf, year, issue = match.group(1, 2, 3)
            return year + _SHELF_IDS[shelf] + issue + suffix
    return None

if __name__ == "__main__":
    # Manual smoke test; guarded so importing this module prints nothing.
    # Another sample input: "《文蚌》2001年第12期总78期.pdf"
    # The result is not named `id`, to avoid shadowing the builtin.
    sample_id = extract_info_from_filename("《勇罕》2015年 第3期.pdf")
    print(sample_id)



